if (!require("ISLR2")) install.packages("ISLR2")
## Loading required package: ISLR2
if (!require("cluster")) install.packages("cluster")
## Loading required package: cluster
if (!require("ggdendro")) install.packages("ggdendro")
## Loading required package: ggdendro
if (!require("factoextra")) install.packages("factoextra")
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tibble)
library(cluster)
library(tidyr)
library(factoextra)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
sum(is.na(df))
## [1] 24
There are 24 NULL values in our data we will examine those as we go along
df[duplicated(df)]
There are no duplicate rows
df <- df %>%
select(-ID)
ggplot(df, aes(x=Year_Birth))+
geom_histogram(color = "grey", fill = "#1f77b4", bins = 30)+
labs(x = "Year of Birth",
y = "count",
title = "Distribution of Birth Year")+
theme_minimal()
df %>%
filter(Year_Birth < 1930)
seems like they are erroneous entries
ggplot(df, aes(Marital_Status)) +
geom_bar(fill = "#1f77b7", alpha = 0.8) +
labs( x= "Marital Status",
y = "count",
title = "Frequency plot for marital status")+
theme_minimal()
ggplot(df, aes(x = Income)) +
geom_histogram(fill = "#1f77b7", alpha = 0.8)+
labs(x = "Income",
y = "Count",
title = "Distribution of Income")+
scale_x_continuous(breaks = seq(5000, 100000, by= 40000))+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (`stat_bin()`).
There is an outlier in data where we see a very large income, to see the distribution clearly lets filter our data
df %>%
filter(Income < 500000) %>%
ggplot(aes(x = Income)) +
geom_histogram(fill = "#1f77b7", alpha = 0.8)+
labs(x = "Income",
y = "Count",
title = "Distribution of Income")+
scale_x_continuous(breaks = seq(5000, 200000, by= 20000))+
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
There are few data points with income greater than 85000 lets call them high income group while rest looks to in the range 19000 - 70000
ggplot(df, aes(x = "", y = Income)) +
geom_violin(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8)+
coord_flip()+
scale_y_continuous(breaks = seq(5000, 165000, by= 40000))+
labs(
x = "Income",
y = "Distribution",
title = "Income Distribution"
)+
theme_minimal(base_size = 20)
## Warning: Removed 24 rows containing non-finite values (`stat_ydensity()`).
df[!complete.cases(df),]
summary(df)
## Year_Birth Education Marital_Status Income
## Min. :1893 Length:2240 Length:2240 Min. : 1730
## 1st Qu.:1959 Class :character Class :character 1st Qu.: 35303
## Median :1970 Mode :character Mode :character Median : 51382
## Mean :1969 Mean : 52247
## 3rd Qu.:1977 3rd Qu.: 68522
## Max. :1996 Max. :666666
## NA's :24
## Kidhome Teenhome Dt_Customer Recency
## Min. :0.0000 Min. :0.0000 Length:2240 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 Class :character 1st Qu.:24.00
## Median :0.0000 Median :0.0000 Mode :character Median :49.00
## Mean :0.4442 Mean :0.5062 Mean :49.11
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:74.00
## Max. :2.0000 Max. :2.0000 Max. :99.00
##
## MntWines MntFruits MntMeatProducts MntFishProducts
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 23.75 1st Qu.: 1.0 1st Qu.: 16.0 1st Qu.: 3.00
## Median : 173.50 Median : 8.0 Median : 67.0 Median : 12.00
## Mean : 303.94 Mean : 26.3 Mean : 166.9 Mean : 37.53
## 3rd Qu.: 504.25 3rd Qu.: 33.0 3rd Qu.: 232.0 3rd Qu.: 50.00
## Max. :1493.00 Max. :199.0 Max. :1725.0 Max. :259.00
##
## MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.00 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 8.00 Median : 24.00 Median : 2.000 Median : 4.000
## Mean : 27.06 Mean : 44.02 Mean : 2.325 Mean : 4.085
## 3rd Qu.: 33.00 3rd Qu.: 56.00 3rd Qu.: 3.000 3rd Qu.: 6.000
## Max. :263.00 Max. :362.00 Max. :15.000 Max. :27.000
##
## NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.: 3.00 1st Qu.: 3.000 1st Qu.:0.00000
## Median : 2.000 Median : 5.00 Median : 6.000 Median :0.00000
## Mean : 2.662 Mean : 5.79 Mean : 5.317 Mean :0.07277
## 3rd Qu.: 4.000 3rd Qu.: 8.00 3rd Qu.: 7.000 3rd Qu.:0.00000
## Max. :28.000 Max. :13.00 Max. :20.000 Max. :1.00000
##
## AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.07455 Mean :0.07277 Mean :0.06429 Mean :0.01339
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
##
## Complain Z_CostContact Z_Revenue Response
## Min. :0.000000 Min. :3 Min. :11 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:3 1st Qu.:11 1st Qu.:0.0000
## Median :0.000000 Median :3 Median :11 Median :0.0000
## Mean :0.009375 Mean :3 Mean :11 Mean :0.1491
## 3rd Qu.:0.000000 3rd Qu.:3 3rd Qu.:11 3rd Qu.:0.0000
## Max. :1.000000 Max. :3 Max. :11 Max. :1.0000
##
The missing values seems to have occurred at random as there are 24 missing values which is 1% of the total data, we can omit those values.
df <- na.omit(df)
sum(is.na(df))
## [1] 0
Formatting Date column
df %>%
select(Dt_Customer)
df<- df %>%
mutate(Dt_Customer = gsub("/", "-", Dt_Customer))
df<- df %>%
mutate(Dt_Customer = as.Date(Dt_Customer, format("%d-%m-%Y")))
summary(df$Dt_Customer)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "2012-07-30" "2013-01-16" "2013-07-08" "2013-07-10" "2013-12-31" "2014-06-29"
Calculating Ages by taking the maximum Date
df <- df %>%
mutate(Age = 2014 - Year_Birth)
ggplot(df, aes(x = Age)) +
geom_histogram(fill = "#1f77b7", bins = 40, alpha = 0.8) +
labs(x = "Age",
y = "count",
title = "Distribution of Age")
df %>%
filter(Income != 666666) %>%
ggplot(aes(x = Age, y = Income) )+
geom_point(color = "#0072B2", size = 2)+
theme_minimal()
There is no any evident pattern
Removing Ouliter from data for Income and capping max age to 70
df<- df %>%
filter(Income != 666666) %>%
mutate(Age = ifelse(Age > 70, 70, Age))
df_product <- df[,c("MntWines","MntFruits", "MntMeatProducts", "MntFishProducts", "MntSweetProducts", "MntGoldProds")]
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.2.3
corr_mat_products <- cor(df_product)
ggcorrplot(corr_mat_products, hc.order = TRUE) +
theme(plot.title = element_text(hjust = 0.8)) +
geom_text(aes(label = value)) +
ggtitle("Correlation Plot for Product bought")
ggpairs(df_product) +
theme_minimal()
No significant relation Present between products
df_gateway <- df[,c("NumDealsPurchases", "NumStorePurchases", "NumWebPurchases", "NumCatalogPurchases", "NumWebVisitsMonth")]
corr_mat_gtwy <- cor(df_gateway)
ggcorrplot(corr_mat_gtwy, hc.order = TRUE) +
theme(plot.title = element_text(hjust = 0.8)) +
geom_text(aes(label = value)) +
ggtitle("Correlation Plot of Sample Data")
df_campaign <- df[,c("AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5")]
corr_mat_campaign <- cor(df_campaign)
ggpairs(df_gateway) +
theme_minimal()
No Significant Relation present
ggcorrplot(corr_mat_campaign, hc.order = TRUE) +
theme(plot.title = element_text(hjust = 0.8)) +
geom_text(aes(label = value)) +
ggtitle("Correlation Plot of Sample Data")
Creating variable Total Purchase which has all product purchased
df <- df %>%
mutate(Total_Purchaase = MntWines + MntFruits + MntMeatProducts + MntFishProducts + MntSweetProducts + MntSweetProducts + MntGoldProds)
df %>%
group_by(Marital_Status) %>%
summarise(Total_Purchase_by_Marital = mean(Total_Purchaase)) %>%
ggplot(aes(x = Marital_Status, y = Total_Purchase_by_Marital)) +
geom_col(fill = "#1f77b7") +
theme_minimal()
We see a graph equivalent to the proportion of the population so there is no particular group purchasing more.
Now we see across each product
df %>%
group_by(Marital_Status) %>%
summarise(Total_Purchase_by_Marital = sum(Total_Purchaase)) %>%
ggplot(aes(x = Marital_Status, y = Total_Purchase_by_Marital)) +
geom_col(fill = "#1f77b7") +
theme_minimal()
df %>%
group_by(Marital_Status) %>%
summarise(Wines = mean(MntWines), Fruits = mean(MntFruits), Meat = mean(MntMeatProducts), Fish = mean(MntFishProducts), Sweet = mean(MntSweetProducts), gold = mean(MntGoldProds)) %>%
ggplot(aes(x = Marital_Status)) +
geom_bar(aes(y = Wines, fill = "Wines"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Fruits, fill = "Fruits"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Meat, fill = "Meat"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Fish, fill = "Fish"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Sweet, fill = "Sweet"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = gold, fill = "Gold"), stat = "identity", alpha = 0.7) +
scale_fill_manual(values = c("Wines" = "#1F77B4", "Fruits" = "#FF7F0E", "Meat" = "#2CA02C", "Fish" = "#D62728", "Sweet" = "#9467BD", "Gold" = "#8C564B")) +
labs(title = "Average Spending on Product Categories by Marital Status",
x = "Marital Status",
y = "Average Spending",
fill = "Type of Product")+
theme_minimal() +
theme(legend.position = "right")
Wine is most common entity bought
df %>%
group_by(Income) %>%
summarise(Total_Purchase_by_Marital = mean(Total_Purchaase)) %>%
ggplot(aes(x = Income, y = Total_Purchase_by_Marital)) +
geom_point(color = "#1f77b7", size = 2) +
theme_minimal()
We see a non-linear relationship between Income and Total Purchase
df %>%
group_by(Income) %>%
summarise(Wines = mean(MntWines), Fruits = mean(MntFruits), Meat = mean(MntMeatProducts), Fish = mean(MntFishProducts), Sweet = mean(MntSweetProducts), gold = mean(MntGoldProds)) %>%
ggplot(aes(x = Income)) +
geom_point(aes(y = Wines, color = "Wines"), alpha = 0.7, size = 2) +
geom_point(aes(y = Fruits, color = "Fruits"), alpha = 0.7, size = 2) +
geom_point(aes(y = Meat, color = "Meat"), alpha = 0.7, size = 2) +
geom_point(aes(y = Fish, color = "Fish"), alpha = 0.7, size = 2) +
geom_point(aes(y = Sweet, color = "Sweet"), alpha = 0.7, size = 2) +
geom_point(aes(y = gold, color = "Gold Prods"), alpha = 0.7, size = 2) +
scale_color_manual(name = "Products", values = c("Wines" = "#E69F00", "Fruits" = "#56B4E9", "Meat" = "#009E73", "Fish" = "#0072B2", "Sweet" = "#D55E00", "Gold Prods" = "#CC79A7")) +
theme_minimal() +
theme(legend.position = "right")
Wine is the most popular followed by by meat for average income households.
df %>%
group_by(Age) %>%
summarise(Wines = mean(MntWines), Fruits = mean(MntFruits), Meat = mean(MntMeatProducts), Fish = mean(MntFishProducts), Sweet = mean(MntSweetProducts), gold = mean(MntGoldProds)) %>%
ggplot(aes(x = Age)) +
geom_point(aes(y = Wines, color = "Wines"), alpha = 0.7, size = 2) +
geom_point(aes(y = Fruits, color = "Fruits"), alpha = 0.7, size = 2) +
geom_point(aes(y = Meat, color = "Meat"), alpha = 0.7, size = 2) +
geom_point(aes(y = Fish, color = "Fish"), alpha = 0.7, size = 2) +
geom_point(aes(y = Sweet, color = "Sweet"), alpha = 0.7, size = 2) +
geom_point(aes(y = gold, color = "Gold Prods"), alpha = 0.7, size = 2) +
scale_color_manual(name = "Products", values = c("Wines" = "#E69F00", "Fruits" = "#56B4E9", "Meat" = "#009E73", "Fish" = "#0072B2", "Sweet" = "#D55E00", "Gold Prods" = "#CC79A7")) +
theme_minimal() +
theme(legend.position = "right")
Wine consumption increases over age. Whereas we see meat consumption beign high in early ages.
df %>%
group_by(Education) %>%
summarise(Wines = mean(MntWines), Fruits = mean(MntFruits), Meat = mean(MntMeatProducts), Fish = mean(MntFishProducts), Sweet = mean(MntSweetProducts), gold = mean(MntGoldProds)) %>%
ggplot(aes(x = Education)) +
geom_bar(aes(y = Wines, fill = "Wines"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Fruits, fill = "Fruits"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Meat, fill = "Meat"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Fish, fill = "Fish"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Sweet, fill = "Sweet"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = gold, fill = "Gold"), stat = "identity", alpha = 0.7) +
scale_fill_manual(values = c("Wines" = "#1F77B4", "Fruits" = "#FF7F0E", "Meat" = "#2CA02C", "Fish" = "#D62728", "Sweet" = "#9467BD", "Gold" = "#8C564B")) +
labs(title = "Average Spending on Product Categories by Education",
x = "Education",
y = "Average Spending",
fill = "Type of Product")+
theme_minimal() +
theme(legend.position = "right")
PhDs consume more wine also the fact they are older validates the the relation with age
We Create following features for Data Modelling
df %>%
select(Kidhome, Teenhome)
df <- df %>%
mutate(Is_Parent = ifelse(Kidhome + Teenhome > 0, 1, 0))
df %>%
ggplot(aes(x = Is_Parent, y = Total_Purchaase)) +
geom_col(fill = "#1f77b7")+
theme_minimal()
df %>%
ggplot(aes(Is_Parent) ) +
geom_bar(fill = "#1f77b7")+
theme_minimal()
We see parents who have kids have spent relative more given then proportion in data.
df <- df %>%
mutate(Education = case_when(
Education == "Basic" ~ "Undergraduate",
Education == "2n Cycle" ~ "Undergraduate",
Education == "Graduation" ~ "Graduate",
Education == "Master" ~ "Postgraduate",
Education == "PhD" ~ "Postgraduate",
TRUE ~ Education # Keep the original value if none of the above conditions match
))
df %>%
ggplot(aes(x = Education, y = Total_Purchaase)) +
geom_col(fill = "#1f77b7")+
theme_minimal()
df %>%
ggplot(aes(x = Education)) +
geom_bar()
df <- df %>%
mutate(Has_Partner = case_when(
Marital_Status %in% c("Married", "Together") ~ 1,
Marital_Status %in% c("Absurd", "Widow", "YOLO", "Divorced", "Single", "Alone") ~ 0
))
df$Teenhome <- as.integer(df$Teenhome)
df$Kidhome <- as.integer(df$Kidhome)
df$Has_Partner <- as.integer(df$Has_Partner)
df <- df %>%
mutate(Family_Size = Kidhome + Teenhome + Has_Partner)
df <- df %>%
mutate(campaign_participation = ifelse(AcceptedCmp3 + AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp4 + AcceptedCmp5 + Response > 0, 1,0) )
features <- df %>%
select(Age, Has_Partner, Is_Parent, Family_Size, Education, Income, Recency, campaign_participation, Total_Purchaase
)
features %>%
head()
str(features)
## 'data.frame': 2215 obs. of 9 variables:
## $ Age : num 57 60 49 30 33 47 43 29 40 64 ...
## $ Has_Partner : int 0 0 1 1 1 1 0 1 1 1 ...
## $ Is_Parent : num 0 1 0 1 1 1 1 1 1 1 ...
## $ Family_Size : int 0 2 1 2 2 2 1 2 2 3 ...
## $ Education : chr "Graduate" "Graduate" "Graduate" "Graduate" ...
## $ Income : int 58138 46344 71613 26646 58293 62513 55635 33454 30351 5648 ...
## $ Recency : int 58 38 26 26 94 16 34 32 19 68 ...
## $ campaign_participation: num 1 0 0 0 0 0 0 0 1 1 ...
## $ Total_Purchaase : int 1705 28 797 56 449 758 639 170 49 50 ...
## - attr(*, "na.action")= 'omit' Named int [1:24] 11 28 44 49 59 72 91 92 93 129 ...
## ..- attr(*, "names")= chr [1:24] "11" "28" "44" "49" ...
features$Education <- as.integer(factor(features$Education, levels = c("Postgraduate","Graduate", "Undergraduate")))
pca <- prcomp(features, scale = TRUE)
screeplot(pca)
pr.var <- pca$sdev^2
pve <- 100 * pr.var/ sum(pr.var)
par(mfrow = c(1, 2))
plot(pve, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
plot(cumsum(pve), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
3 Principle component is good choice as it contributes to about 69% of the variation and there is an elbow point at 3,
library(plotly)
p <- plot_ly(x = pca$x[,1], y = pca$x[,2], z = pca$x[,3], type = "scatter3d",
mode = "markers") %>%
layout(scene = list(xaxis = list(title = "PC1"), yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")))
# Display the plot
p
fviz_nbclust(pca$x[,1:3], kmeans, method = "wss",k.max=10, nstart=20, iter.max=20) +
geom_vline(xintercept = 4, linetype = 4)+
labs(subtitle = "Elbow method")
gap_kmeans <- clusGap(pca$x[,1:3], kmeans, nstart = 20, K.max = 10, B = 100)
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: did not converge in 10 iterations
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 110750)
## Warning: did not converge in 10 iterations
plot(gap_kmeans, main = "Gap Statistic: kmeans")
So, 4 seems like a good choice as the values post that do not add much to the curves.
km <- kmeans(pca$x[,1:3], 4)
# Add cluster assignment to the pca object
pca$cluster <- as.factor(km$cluster)
p <- plot_ly(x = pca$x[,1], y = pca$x[,2], z = pca$x[,3], type = "scatter3d",
mode = "markers", color = pca$cluster) %>%
layout(scene = list(xaxis = list(title = "PC1"), yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")))
# Display the plot
p
df <- df %>%
mutate(cluster = as.factor(km$cluster))
ggplot(df, aes(x = cluster)) +
geom_bar(fill = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
ggtitle("Distribution Of The Clusters")+
theme_minimal()
Evenly distributed size of each cluster
ggplot(df, aes(x = Total_Purchaase, y = Income, color = cluster)) +
geom_point() +
scale_color_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
ggtitle("Cluster's Profile Based On Income And Spending") +
xlab("Total Purchase") +
ylab("Income")+
guides(color = guide_legend(title = "Clusters"))+
theme_minimal()
Green is high income while yellow is low income
ggplot(df, aes(x = cluster, y = Total_Purchaase)) +
geom_point(size = 1,color = "#1f77b7", alpha = 0.5) +
geom_boxplot(aes(fill = cluster), color = "black", outlier.shape = NA) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
ggtitle("Cluster's Spending Distribution") +
xlab("Clusters") +
ylab("Spending")
Green have high spending while yellow has low
df %>%
mutate(Total_Promos = AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5) %>%
ggplot(aes(x = Total_Promos, fill = factor(cluster))) +
geom_bar(position ="dodge", alpha = 0.8) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
ggtitle("Count Of Promotion Accepted") +
xlab("Number Of Total Accepted Promotions") +
ylab("Count")
The later campaign were most appealed to green cluster
ggplot(df, aes(x = factor(cluster), y = NumDealsPurchases, fill = factor(cluster))) +
geom_boxplot(alpha = 0.8) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
ggtitle("Number of Deals Purchased") +
xlab("Clusters") +
ylab("Number of Deals Purchased")
green did not get too many deals
df %>%
group_by(cluster) %>%
summarise(Wines = mean(MntWines), Fruits = mean(MntFruits), Meat = mean(MntMeatProducts), Fish = mean(MntFishProducts), Sweet = mean(MntSweetProducts), gold = mean(MntGoldProds)) %>%
ggplot(aes(x = cluster)) +
geom_bar(aes(y = Wines, fill = "Wines"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Fruits, fill = "Fruits"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Meat, fill = "Meat"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Fish, fill = "Fish"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Sweet, fill = "Sweet"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = gold, fill = "Gold"), stat = "identity", alpha = 0.7) +
scale_fill_manual(values = c("Wines" = "#1F77B4", "Fruits" = "#FF7F0E", "Meat" = "#2CA02C", "Fish" = "#D62728", "Sweet" = "#9467BD", "Gold" = "#8C564B")) +
labs(title = "Average Spending on Product Categories by Cluster",
x = "Cluster",
y = "Average Spending",
fill = "Type of Product")+
theme_minimal() +
theme(legend.position = "right")
green consumes more meat
ggplot(df, aes(Teenhome, fill = cluster)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
theme_minimal()
Red is a parent for sure while green isn’t
ggplot(df, aes(Kidhome, fill = cluster)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
theme_minimal()
Green doesn’t have kid
ggplot(df, aes(Family_Size, fill = cluster)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
theme_minimal()
Red and Yellow are definitely have family while green and blue seem they don’t
ggplot(df, aes(Age, fill = cluster)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
theme_minimal()
ggplot(df, aes(Education, fill = cluster)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
theme_minimal()
No Particular specific conclusion can be drawn
ggplot(df, aes(Has_Partner, fill = cluster)) +
geom_density(alpha = 0.5) +
scale_fill_manual(values = c("#3366CC", "#DC3912", "#FF9900", "#109618")) +
theme_minimal()
Red and Yellow have a partner while blue mostly doesn’t and nothing conclusive could be said about green.
Mostly Single Mostly Not a Parent Age-wise they are somewhere 30-60 Mid to High Income Group
They are parents They have a Partner Average Income Group Are Older
Mostly have a partner They are parent Low Income Group Are Younger than other groups (30-40)
Good Proportion of them have a Partner Are not Parent High Income Group Age-wise they are distributed evenly